outliers <- read_tsv("../input_data/druggable_outliers_from_treehouse_and_other_cohorts_2023_11_09-13_46_32_2023.tsv") %>%
mutate(high_level_cohort = ifelse(str_detect(comparison_cohort, "Treehouse"),
"Treehouse",
comparison_cohort))
## Rows: 287 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (4): Sample_ID, comparison_cohort, gene, donor_ID
## lgl (1): pathway_support
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
outlier_genes_detected <- unique(outliers$gene)
expr <- read_tsv("../input_data/druggable_TumorCompendium_v11_PolyA_hugo_log2tpm_58581genes_2020-04-09.tsv.gz") %>%
rename(Sample_ID = TH_id) %>%
filter(Gene %in% outlier_genes_detected)
## Rows: 1414917 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): Gene, TH_id
## dbl (1): log2TPM1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
stanford_samples <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/TH03_TH34_rollup.sample_list.txt",
col_names = "Sample_ID") %>%
mutate(cohort = "Stanford")
## Rows: 110 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
TCGA_samples <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/TCGA_rollup.sample_list.txt",
col_names = "Sample_ID") %>%
mutate(cohort = "TCGA")
## Rows: 9806 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
PEDAYA_samples <- read_tsv("../gather_input_data/comparison_to_non_CARE_cohorts/data/PEDAYA_rollup.sample_list.txt",
col_names = "Sample_ID") %>%
mutate(cohort = "PEDAYA")
## Rows: 2814 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): Sample_ID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pan_cancer_samples <- expr %>%
select(Sample_ID) %>%
distinct() %>%
mutate(cohort = "Treehouse_pancancer")
samples_in_cohorts <- bind_rows(
stanford_samples,
TCGA_samples,
PEDAYA_samples,
pan_cancer_samples)
tabyl(samples_in_cohorts,
cohort)
## cohort n percent
## PEDAYA 2814 0.11045257
## Stanford 110 0.00431762
## TCGA 9806 0.38489618
## Treehouse_pancancer 12747 0.50033363
rsem_path <- "../input_data/non_compendium_expression"
gene_name_conversion <- read_tsv(file.path(rsem_path,
"EnsGeneID_Hugo_Observed_Conversions.txt"))
## Rows: 60498 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): HugoID, EnsGeneID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
relevant_gene_name_conversion <- gene_name_conversion %>%
filter(HugoID %in% outlier_genes_detected)
rsem_kitchen_sink_data <- tibble(file_name = list.files(
path = rsem_path,
pattern = "_rsem_genes.results")) %>%
rowwise() %>%
mutate(rsem_raw = list(read_tsv(file.path(rsem_path, file_name),
show_col_types = FALSE
))) %>%
unnest(rsem_raw) %>%
filter(gene_id %in% relevant_gene_name_conversion$EnsGeneID) %>%
mutate(Sample_ID = str_extract(file_name, "TH[R]?[0-9]{2}_[0-9]{4}_S[0-9]{2}")) %>%
left_join(relevant_gene_name_conversion,
by=c("gene_id"="EnsGeneID")) %>%
group_by(Sample_ID, HugoID) %>%
summarize(sum_TPM = sum(TPM),
n=n()) %>%
mutate(log2TPM1 = log2(sum_TPM +1))
## `summarise()` has grouped output by 'Sample_ID'. You can override using the
## `.groups` argument.
table(rsem_kitchen_sink_data$n)
##
## 1 2
## 275 5
patient_expression_from_rsem_files <- rsem_kitchen_sink_data %>%
select(gene = HugoID,
log2TPM1,
Sample_ID)
patient_expression_from_compendia <- outliers %>%
select(Sample_ID, gene) %>%
distinct() %>%
left_join(expr,
by=c("Sample_ID", "gene"="Gene"))
patient_expression <- bind_rows(
patient_expression_from_rsem_files,
patient_expression_from_compendia)
length(outlier_genes_detected)
## [1] 56
outliers$Sample_ID[ ! outliers$Sample_ID %in% expr$Sample_ID] %>% unique()
## [1] "TH34_1400_S01" "TH34_2292_S01" "TH34_2666_S01" "TH34_1445_S02"
## [5] "TH34_1456_S02"
outliers
patient_expression
## # A tibble: 410 × 3
## # Groups: Sample_ID [34]
## gene log2TPM1 Sample_ID
## <chr> <dbl> <chr>
## 1 AKT1 6.41 TH34_1400_S01
## 2 AKT2 7.55 TH34_1400_S01
## 3 ALK 0.791 TH34_1400_S01
## 4 BCL6 6.68 TH34_1400_S01
## 5 BTK 2.09 TH34_1400_S01
## 6 CCND1 5.10 TH34_1400_S01
## 7 CCND2 3.35 TH34_1400_S01
## 8 CCND3 4.52 TH34_1400_S01
## 9 CCNE1 1.17 TH34_1400_S01
## 10 CDK4 5.63 TH34_1400_S01
## # ℹ 400 more rows
this_gene <- "FGFR4"
one_gene_expr_per_cohort <- bind_rows(
expr %>%
filter(Gene == this_gene,
Sample_ID %in% stanford_samples$Sample_ID) %>%
mutate(cohort = "Stanford"),
expr %>%
filter(Gene == this_gene,
Sample_ID %in% TCGA_samples$Sample_ID) %>%
mutate(cohort = "TCGA"),
expr %>%
filter(Gene == this_gene,
Sample_ID %in% PEDAYA_samples$Sample_ID) %>%
mutate(cohort = "PEDAYA"),
expr %>%
filter(Gene == this_gene,
Sample_ID %in% pan_cancer_samples$Sample_ID) %>%
mutate(cohort = "pan_cancer"))
# How many colors to i need
outliers %>%
group_by(gene) %>%
summarize(n_samples = length(unique(Sample_ID))) %>%
arrange(desc(n_samples))
## # A tibble: 56 × 2
## gene n_samples
## <chr> <int>
## 1 IGF2 18
## 2 HMOX1 8
## 3 NTRK2 7
## 4 FGFR4 5
## 5 ETV1 4
## 6 NTRK3 4
## 7 BTK 3
## 8 CDK9 3
## 9 FGFR1 3
## 10 FLT4 3
## # ℹ 46 more rows
library(khroma)
lapply(outlier_genes_detected, function(this_gene){
# this_gene <- "PIK3R5"
relevant_patient_expression <- patient_expression %>%
filter(gene == this_gene) %>%
filter(Sample_ID %in% (outliers %>%
filter(gene == this_gene) %>%
pull(Sample_ID)))
one_gene_expr_per_cohort <- left_join(samples_in_cohorts,
expr %>%
filter(Gene == this_gene))
p1 <- ggplot(one_gene_expr_per_cohort) +
geom_histogram(aes(x=log2TPM1)) +
geom_vline(data = relevant_patient_expression,
aes(xintercept = log2TPM1,
color = Sample_ID)) +
scale_color_brewer(palette = "Set1") +
facet_col(~cohort, scales = "free_y") +
ggtitle(this_gene)
p2 <- ggplot(one_gene_expr_per_cohort) +
geom_boxplot(aes(x=log2TPM1)) +
geom_vline(data = relevant_patient_expression,
aes(xintercept = log2TPM1,
color = Sample_ID)) +
scale_color_brewer(palette = "Set1") +
facet_col(~cohort)
outlier_table <- outliers %>%
filter(gene == this_gene) %>%
select(Sample_ID, gene, comparison_cohort) %>%
mutate(found = TRUE) %>%
pivot_wider(names_from = comparison_cohort,
values_from = found)
t1 <- tableGrob(outlier_table, theme=ttheme_minimal(), rows=NULL) # transform into a tableGrob
cowplot::plot_grid(p1, p2, t1,
ncol = 1)
})
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(n, pal): Removed 4 rows containing missing values (`geom_vline()`).
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette Set1 is 9
## Returning the palette you asked for with that many colors
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Warning: Removed 8 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Joining, by = "Sample_ID"
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (`geom_vline()`).
## Removed 4 rows containing missing values (`geom_vline()`).
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]
##
## [[46]]
##
## [[47]]
##
## [[48]]
##
## [[49]]
##
## [[50]]
##
## [[51]]
##
## [[52]]
##
## [[53]]
##
## [[54]]
##
## [[55]]
##
## [[56]]